Earthquake Prediction Project¶

Importing the necessary libraries¶

In [30]:
import numpy as np
import pandas as pd
import requests
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import time

Linking the csv dataset file¶

Dataset Link - https://drive.google.com/drive/folders/1b4IyWfUzbDgOvuW_W7rOSEeTpBN53Mlf

In [31]:
df = pd.read_csv(r'C:\Users\arjun\OneDrive\Desktop\2023 Sept Fall 3rd sem\Predictive Analytics\Earthquake_dataset_Arjun\earthquake1.csv')
In [32]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24007 entries, 0 to 24006
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         24007 non-null  float64
 1   date       24007 non-null  object 
 2   time       24007 non-null  object 
 3   lat        24007 non-null  float64
 4   long       24007 non-null  float64
 5   country    24007 non-null  object 
 6   city       11754 non-null  object 
 7   area       12977 non-null  object 
 8   direction  10062 non-null  object 
 9   dist       10062 non-null  float64
 10  depth      24007 non-null  float64
 11  xm         24007 non-null  float64
 12  md         24007 non-null  float64
 13  richter    24007 non-null  float64
 14  mw         5003 non-null   float64
 15  ms         24007 non-null  float64
 16  mb         24007 non-null  float64
dtypes: float64(11), object(6)
memory usage: 3.1+ MB
In [33]:
df.describe()
Out[33]:
id lat long dist depth xm md richter mw ms mb
count 2.400700e+04 24007.000000 24007.000000 10062.000000 24007.000000 24007.000000 24007.000000 24007.000000 5003.000000 24007.000000 24007.000000
mean 1.991982e+13 37.929474 30.773229 3.175015 18.491773 4.056038 1.912346 2.196826 4.478973 0.677677 1.690561
std 2.060396e+11 2.205605 6.584596 4.715461 23.218553 0.574085 2.059780 2.081417 1.048085 1.675708 2.146108
min 1.910000e+13 29.740000 18.340000 0.100000 0.000000 3.500000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.980000e+13 36.190000 26.195000 1.400000 5.000000 3.600000 0.000000 0.000000 4.100000 0.000000 0.000000
50% 2.000000e+13 38.200000 28.350000 2.300000 10.000000 3.900000 0.000000 3.500000 4.700000 0.000000 0.000000
75% 2.010000e+13 39.360000 33.855000 3.600000 22.400000 4.400000 3.800000 4.000000 5.000000 0.000000 4.100000
max 2.020000e+13 46.350000 48.000000 95.400000 225.000000 7.900000 7.400000 7.200000 7.700000 7.900000 7.100000
In [34]:
df.shape
Out[34]:
(24007, 17)
In [35]:
df.head()
Out[35]:
id date time lat long country city area direction dist depth xm md richter mw ms mb
0 2.000000e+13 2003.05.20 12:17:44 AM 39.04 40.38 turkey bingol baliklicay west 0.1 10.0 4.1 4.1 0.0 NaN 0.0 0.0
1 2.010000e+13 2007.08.01 12:03:08 AM 40.79 30.09 turkey kocaeli bayraktar_izmit west 0.1 5.2 4.0 3.8 4.0 NaN 0.0 0.0
2 1.980000e+13 1978.05.07 12:41:37 AM 38.58 27.61 turkey manisa hamzabeyli south_west 0.1 0.0 3.7 0.0 0.0 NaN 0.0 3.7
3 2.000000e+13 1997.03.22 12:31:45 AM 39.47 36.44 turkey sivas kahvepinar_sarkisla south_west 0.1 10.0 3.5 3.5 0.0 NaN 0.0 0.0
4 2.000000e+13 2000.04.02 12:57:38 AM 40.80 30.24 turkey sakarya meseli_serdivan south_west 0.1 7.0 4.3 4.3 0.0 NaN 0.0 0.0
In [36]:
df.columns
Out[36]:
Index(['id', 'date', 'time', 'lat', 'long', 'country', 'city', 'area',
       'direction', 'dist', 'depth', 'xm', 'md', 'richter', 'mw', 'ms', 'mb'],
      dtype='object')

Data Preprocessing

In [37]:
df = df.drop('id',axis=1)
In [38]:
import datetime
import time
import pandas as pd

timestamp = []

for d, t in zip(df['date'], df['time']):
    try:
        ts = datetime.datetime.strptime(d+' '+t, '%Y.%m.%d %I:%M:%S %p')
        timestamp.append(time.mktime(ts.timetuple()))
    except OverflowError:
        timestamp.append(None)  # or any other value that indicates an error

timeStamp = pd.Series(timestamp)
df['Timestamp'] = timeStamp.values
final_data = df.drop(['date', 'time'], axis=1)
df = final_data
df = df.dropna(subset=['Timestamp'])  # Drop rows with invalid timestamps
df.head()
Out[38]:
lat long country city area direction dist depth xm md richter mw ms mb Timestamp
0 39.04 40.38 turkey bingol baliklicay west 0.1 10.0 4.1 4.1 0.0 NaN 0.0 0.0 1.053401e+09
1 40.79 30.09 turkey kocaeli bayraktar_izmit west 0.1 5.2 4.0 3.8 4.0 NaN 0.0 0.0 1.185937e+09
2 38.58 27.61 turkey manisa hamzabeyli south_west 0.1 0.0 3.7 0.0 0.0 NaN 0.0 3.7 2.633605e+08
3 39.47 36.44 turkey sivas kahvepinar_sarkisla south_west 0.1 10.0 3.5 3.5 0.0 NaN 0.0 0.0 8.590015e+08
4 40.80 30.24 turkey sakarya meseli_serdivan south_west 0.1 7.0 4.3 4.3 0.0 NaN 0.0 0.0 9.546479e+08
In [39]:
df.dtypes
Out[39]:
lat          float64
long         float64
country       object
city          object
area          object
direction     object
dist         float64
depth        float64
xm           float64
md           float64
richter      float64
mw           float64
ms           float64
mb           float64
Timestamp    float64
dtype: object
In [12]:
# Data Encoding
label_encoder = preprocessing.LabelEncoder() 
for col in df.columns:
    if df[col].dtype == 'object':
      label_encoder.fit(df[col])
      df[col] = label_encoder.transform(df[col])
df.dtypes
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_encoder.transform(df[col])
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_encoder.transform(df[col])
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_encoder.transform(df[col])
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_encoder.transform(df[col])
Out[12]:
lat          float64
long         float64
country        int32
city           int32
area           int32
direction      int32
dist         float64
depth        float64
xm           float64
md           float64
richter      float64
mw           float64
ms           float64
mb           float64
Timestamp    float64
dtype: object
In [13]:
df.isnull().sum()
Out[13]:
lat              0
long             0
country          0
city             0
area             0
direction        0
dist         12250
depth            0
xm               0
md               0
richter          0
mw           18681
ms               0
mb               0
Timestamp        0
dtype: int64
In [14]:
# Imputing Missing Values with Mean
si=SimpleImputer(missing_values = np.nan, strategy="mean")
si.fit(df[["dist","mw"]])
df[["dist","mw"]] = si.transform(df[["dist","mw"]])
df.isnull().sum()
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1255669323.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[["dist","mw"]] = si.transform(df[["dist","mw"]])
Out[14]:
lat          0
long         0
country      0
city         0
area         0
direction    0
dist         0
depth        0
xm           0
md           0
richter      0
mw           0
ms           0
mb           0
Timestamp    0
dtype: int64

Data Visualization

In [15]:
import plotly.express as px
px.scatter(df, x='richter',y='xm', color="direction")
In [16]:
plt.figure(figsize=(7,7))
sns.histplot(data=df, x='depth', hue='direction',palette = 'Accent')
plt.show()
No description has been provided for this image
In [17]:
plt.figure(figsize=(7,7))
df[['lat','long']].hist()
plt.show()
<Figure size 700x700 with 0 Axes>
No description has been provided for this image
In [19]:
plt.figure(figsize=(10,10))
sns.distplot(df.xm)
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\3879995472.py:2: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


Out[19]:
<Axes: xlabel='xm', ylabel='Density'>
No description has been provided for this image
In [20]:
plt.figure(figsize=(15,10))
sns.barplot(x=df['xm'], y=df['ms'])
plt.xlabel('xm')
plt.ylabel('ms')
Out[20]:
Text(0, 0.5, 'ms')
No description has been provided for this image
In [21]:
plt.scatter(df.depth, df.xm)
plt.xlabel("Depth")
plt.ylabel("xm")
plt.show()
No description has been provided for this image
In [22]:
plt.scatter(df.depth, df.mb)
plt.xlabel("Depth")
plt.ylabel("Magnitude body")
plt.show()
No description has been provided for this image
In [23]:
plt.scatter(df.dist, df.depth)
plt.xlabel("Area affected")
plt.ylabel("Depth")
plt.show()
No description has been provided for this image
In [24]:
plt.scatter(df.dist, df.depth)
plt.xlabel("Area affected")
plt.ylabel("xm")
plt.show()
No description has been provided for this image

Correlation between Attributes

In [25]:
most_correlated = df.corr()['xm'].sort_values(ascending=False)
most_correlated
Out[25]:
xm           1.000000
mb           0.520944
ms           0.466182
richter      0.279865
mw           0.279404
depth        0.248506
area         0.111337
city         0.104095
long         0.091117
direction    0.085171
dist         0.001002
lat         -0.043337
country     -0.045640
md          -0.101287
Timestamp   -0.194235
Name: xm, dtype: float64
In [26]:
plt.figure(figsize=(20,20))
dataplot=sns.heatmap(df.corr(),annot=True)
plt.show()
No description has been provided for this image

Normalization of data

In [27]:
# Using MinMaxScaler
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(df)
df = pd.DataFrame(d, columns=df.columns)
df.head()
Out[27]:
lat long country city area direction dist depth xm md richter mw ms mb Timestamp
0 0.559904 0.743088 0.76 0.173913 0.116178 0.875 0.0 0.048077 0.150 0.554054 0.000000 0.553294 0.0 0.000000 0.701575
1 0.665262 0.396156 0.76 0.608696 0.132893 0.875 0.0 0.025000 0.125 0.513514 0.571429 0.553294 0.0 0.000000 0.789847
2 0.532210 0.312542 0.76 0.673913 0.459761 0.750 0.0 0.000000 0.050 0.000000 0.000000 0.553294 0.0 0.544118 0.175393
3 0.585792 0.610249 0.76 0.869565 0.511762 0.750 0.0 0.048077 0.000 0.472973 0.000000 0.553294 0.0 0.000000 0.572101
4 0.665864 0.401214 0.76 0.804348 0.688609 0.750 0.0 0.033654 0.200 0.581081 0.000000 0.553294 0.0 0.000000 0.635804

Splitting the Dataset

In [28]:
y=np.array(df['xm']) 
X=np.array(df.drop('xm',axis=1))
from sklearn.model_selection import train_test_split  
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2) 

Creating Models

  1. Linear Regression
In [425]:
from sklearn.linear_model import LinearRegression 
start1 = time.time() 
linear=LinearRegression() 
linear.fit(X_train,y_train) 
ans1 = linear.predict(X_test)
end1 = time.time()
t1 = end1-start1
In [426]:
accuracy1=linear.score(X_test,y_test) 
print("Accuracy of Linear Regression model is:",accuracy1)
Accuracy of Linear Regression model is: 0.63134131503029
In [427]:
from sklearn import metrics
print("Linear Regression")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, ans1))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, ans1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, ans1)))
Linear Regression
Mean Absolute Error: 0.05878246463205686
Mean Squared Error: 0.00625827169726636
Root Mean Squared Error: 0.07910923901331854
In [428]:
plt.plot(y_test, ans1, 'o') 
m, b = np.polyfit(y_test,ans1, 1) 
plt.plot(y_test, m*y_test + b) 
plt.xlabel("Actual Magnitude")
plt.ylabel("Predicted Magnitude")
Out[428]:
Text(0, 0.5, 'Predicted Magnitude')
No description has been provided for this image
  1. Decision Tree
In [449]:
from sklearn.tree import DecisionTreeRegressor
start2 = time.time() 
regressor = DecisionTreeRegressor(random_state = 40) 
regressor.fit(X_train,y_train)
ans2 = regressor.predict(X_test)
end2 = time.time()
t2 = end2-start2
In [450]:
accuracy2=regressor.score(X_test,y_test) 
print("Accuracy of Decision Tree model is:",accuracy2)
Accuracy of Decision Tree model is: 0.9932960893884235
In [451]:
print("Decision Tree")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, ans2))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, ans2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, ans2)))
Decision Tree
Mean Absolute Error: 0.0006909999621372331
Mean Squared Error: 0.00011380416561969702
Root Mean Squared Error: 0.010667903525046383
  1. KNN Model
In [432]:
from sklearn.neighbors import KNeighborsRegressor
start3 = time.time() 
knn = KNeighborsRegressor(n_neighbors=6)
knn.fit(X_train, y_train)
ans3 = knn.predict(X_test)
end3 = time.time()
t3 = end3-start3
In [433]:
accuracy3=knn.score(X_test,y_test) 
print("Accuracy of KNN model is:",accuracy3)
Accuracy of KNN model is: 0.8457466919393031
In [434]:
print("KNN Model")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, ans3))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, ans3))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, ans3)))
KNN Model
Mean Absolute Error: 0.03305598677318794
Mean Squared Error: 0.002618571462992348
Root Mean Squared Error: 0.051171979275696854
In [435]:
import random
info = {}
for i in range(10):
  k = random.randint(2,10) 
  startk = time.time()
  knn = KNeighborsRegressor(n_neighbors=k)
  knn.fit(X_train, y_train)
  ans3 = knn.predict(X_test)
  endk = time.time()
  tk = endk-startk
  acc3=knn.score(X_test,y_test) 
  info[k] = [acc3,tk]

for i in info:
  print("for k =",i,": accuracy =",info[i][0])
for k = 4 : accuracy = 0.8559118607470738
for k = 9 : accuracy = 0.8334625255508568
for k = 8 : accuracy = 0.8384577534478264
for k = 6 : accuracy = 0.8457466919393031
for k = 5 : accuracy = 0.8519381145638621
for k = 10 : accuracy = 0.8296048410841246
for k = 7 : accuracy = 0.8425261199362686
In [436]:
x = list(info.keys())
yacc = []
for i in info:
  yacc.append(info[i][0])
plt.plot(x, yacc, 'o', color='black');
plt.xlabel("k value")
plt.ylabel("accuracy");
plt.title("Accuracy for different values of k")
Out[436]:
Text(0.5, 1.0, 'Accuracy for different values of k')
No description has been provided for this image
In [437]:
yt = []
for i in info:
  yt.append(info[i][1])
plt.plot(x, yt, 'o', color='black');
plt.xlabel("k value")
plt.ylabel("execution time");
plt.title("Execution time for different values of k")
Out[437]:
Text(0.5, 1.0, 'Execution time for different values of k')
No description has been provided for this image

Comparison Graphs

  1. Accuracy
In [454]:
models = ["linear regression","decision tree","knn"]
accuracies = [accuracy1,accuracy2,accuracy3]
In [455]:
plt.bar(models, accuracies, color ='maroon',
        width = 0.25)
plt.xlabel("Models")
plt.ylabel("Accuracies")
plt.title("Accuracy Comparison Graph")
Out[455]:
Text(0.5, 1.0, 'Accuracy Comparison Graph')
No description has been provided for this image
  1. Execution Time
In [456]:
times = [t1,t2,t3]
plt.bar(models, times, color ='maroon',
        width = 0.25)
plt.xlabel("Models")
plt.ylabel("Execution Time")
plt.title("Execution Time Comparison Graph")
Out[456]:
Text(0.5, 1.0, 'Execution Time Comparison Graph')
No description has been provided for this image
In [ ]: